# Subselect column after download -> TODO remove
#for (city in CITIES) {
#    tmp_df = read.csv(glue::glue("data/airbnb/{city}.csv"))
#    write.csv(tmp_df %>% select(DATASET_COLUMNS), glue::glue("data/airbnb/{city}.csv"), row.names=FALSE)
#}
# Define column groups
df_columns = colnames(df)
host_columns    = df_columns[grepl('Host', df_columns)]
review_columns  = df_columns[grepl('Review', df_columns)]
price_columns   = c('Price', 'Security.Deposit', 'Cleaning.Fee', 'Cancellation.Policy')
services_columns   = c('Instant_Bookable', 'Internet', 'Kitchen', 'Washer', 'Breakfast', 'Air_conditioning', 'Experiences.Offered')
accomodation_columns   = c('Property.Type', 'Room.Type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds', 'Bed.Type', 'Minimum.Nights')
summary(df[, c('City')])
##         City     
##  Amsterdam:3351  
##  Barcelona:3678  
##  Berlin   :2802  
##  London   :2816  
##  Roma     :3350  
##  Wien     :3271
summary(df[, host_columns])
##    Host.Since            Host.Response.Time Host.Response.Rate Host.ProfilePic
##  Min.   :1529   a few days or more:  501    Min.   :  0.0      Mode :logical  
##  1st Qu.:2057   within a day      : 3689    1st Qu.: 99.0      FALSE:36       
##  Median :2463   within a few hours: 4793    Median :100.0      TRUE :19232    
##  Mean   :2532   within an hour    :10285    Mean   : 94.2                     
##  3rd Qu.:2960                               3rd Qu.:100.0                     
##  Max.   :4709                               Max.   :100.0                     
##  Host.SuperHost  Host.verified  
##  Mode :logical   Mode :logical  
##  FALSE:16610     FALSE:7471     
##  TRUE :2658      TRUE :11797    
##                                 
##                                 
## 
summary(df[, review_columns])
##  Number.of.Reviews Review.Scores.Rating Review.Scores.Accuracy
##  Min.   :  1.00    Min.   : 20.00       Min.   : 2.000        
##  1st Qu.:  4.00    1st Qu.: 90.00       1st Qu.: 9.000        
##  Median : 11.00    Median : 95.00       Median :10.000        
##  Mean   : 24.26    Mean   : 92.59       Mean   : 9.506        
##  3rd Qu.: 29.00    3rd Qu.: 98.00       3rd Qu.:10.000        
##  Max.   :417.00    Max.   :100.00       Max.   :10.000        
##  Review.Scores.Cleanliness Review.Scores.Checkin Review.Scores.Communication
##  Min.   : 2.000            Min.   : 2.000        Min.   : 2.000             
##  1st Qu.: 9.000            1st Qu.: 9.000        1st Qu.:10.000             
##  Median :10.000            Median :10.000        Median :10.000             
##  Mean   : 9.367            Mean   : 9.667        Mean   : 9.662             
##  3rd Qu.:10.000            3rd Qu.:10.000        3rd Qu.:10.000             
##  Max.   :10.000            Max.   :10.000        Max.   :10.000             
##  Review.Scores.Location Review.Scores.Value
##  Min.   : 2.000         Min.   : 2.000     
##  1st Qu.: 9.000         1st Qu.: 9.000     
##  Median :10.000         Median : 9.000     
##  Mean   : 9.406         Mean   : 9.264     
##  3rd Qu.:10.000         3rd Qu.:10.000     
##  Max.   :10.000         Max.   :10.000
summary(df[, price_columns])
##      Price        Security.Deposit  Cleaning.Fee      Cancellation.Policy
##  Min.   :  9.00   Min.   :  0.00   Min.   :  0.00   strict      :6838    
##  1st Qu.: 45.00   1st Qu.:  0.00   1st Qu.:  0.00   moderate    :5228    
##  Median : 70.00   Median :  0.00   Median : 20.00   flexible    :3843    
##  Mean   : 89.87   Mean   : 93.91   Mean   : 24.64   moderate_new:1438    
##  3rd Qu.:110.00   3rd Qu.:150.00   3rd Qu.: 40.00   strict_new  :1038    
##  Max.   :999.00   Max.   :999.00   Max.   :375.00   flexible_new: 867    
##                                                     (Other)     :  16
summary(df[, services_columns])
##  Instant_Bookable Internet        Kitchen          Washer       
##  Mode :logical    Mode:logical   Mode :logical   Mode :logical  
##  FALSE:13234      TRUE:19268     FALSE:1680      FALSE:4729     
##  TRUE :6034                      TRUE :17588     TRUE :14539    
##                                                                 
##                                                                 
##  Breakfast       Air_conditioning Experiences.Offered
##  Mode :logical   Mode :logical    business:   54     
##  FALSE:16957     FALSE:14304      family  :   53     
##  TRUE :2311      TRUE :4964       none    :19095     
##                                   romantic:   26     
##                                   social  :   40
summary(df[, accomodation_columns])
##          Property.Type             Room.Type      Accommodates   
##  Apartment      :16071   Entire home/apt:12119   Min.   : 1.000  
##  House          : 1324   Private room   : 6972   1st Qu.: 2.000  
##  Bed & Breakfast:  810   Shared room    :  177   Median : 3.000  
##  Condominium    :  298                           Mean   : 3.312  
##  Loft           :  231                           3rd Qu.: 4.000  
##  Boat           :  135                           Max.   :16.000  
##  (Other)        :  399                                           
##    Bathrooms        Bedrooms           Beds                 Bed.Type    
##  Min.   :0.000   Min.   : 0.000   Min.   : 1.000   Airbed       :   16  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.: 1.000   Couch        :   46  
##  Median :1.000   Median : 1.000   Median : 2.000   Futon        :  115  
##  Mean   :1.208   Mean   : 1.368   Mean   : 2.039   Pull-out Sofa:  336  
##  3rd Qu.:1.000   3rd Qu.: 2.000   3rd Qu.: 2.000   Real Bed     :18755  
##  Max.   :8.000   Max.   :10.000   Max.   :16.000                        
##                                                                         
##  Minimum.Nights   
##  Min.   :  1.000  
##  1st Qu.:  1.000  
##  Median :  2.000  
##  Mean   :  2.865  
##  3rd Qu.:  3.000  
##  Max.   :200.000  
## 
# Aggregate data to do some plotting

agg_df = df %>% 
            select(City, Latitude, Longitude, Price) %>%
            group_by(City) %>%
            summarise(across(everything(), list(mean))) %>%
            mutate(lat = Latitude_1,
                    long = Longitude_1,
                   price = Price_1)
pal <- colorNumeric(
  palette = "viridis",
  domain = df$price)
## Warning: Unknown or uninitialised column: `price`.
labs <- lapply(seq(nrow(agg_df)), function(i) {
  paste0( '<p>', '<b>City:</b> ', agg_df[i, "City"][[1]], '<p></p>', 
          '<b>Avg Price per night:</b> ', round(agg_df[i, "price"], 2), '</p>' ) 
})


m<-leaflet(agg_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~long, lat= ~lat, color= ~pal(price), radius = ~price/5, label = lapply(labs, htmltools::HTML)) %>%
  addLegend(pal = pal, values = ~price, title = "Average price",
    labFormat = labelFormat(suffix = "€"), opacity = 1)

fig = plot_ly(df, y = ~Price, color = ~City, type = "box")

p = bscols(m, fig)
p
atmp_df = df %>% filter(City == 'Amsterdam', Price < 250) %>% sample_n(1000)
batmp_df = df %>% filter(City == 'Barcelona', Price < 250) %>% sample_n(1000)
betmp_df = df %>% filter(City == 'Berlin', Price < 250) %>% sample_n(1000)
ltmp_df = df %>% filter(City == 'London', Price < 250) %>% sample_n(1000)
rtmp_df = df %>% filter(City == 'Roma', Price < 250) %>% sample_n(1000)
wtmp_df = df %>% filter(City == 'Wien', Price < 250) %>% sample_n(1000)

labs <- lapply(seq(nrow(atmp_df)), function(i) {
  paste0( '<p>', '<b>City:</b> ', atmp_df[i, "City"][[1]], '<p></p>', 
          '<b>Avg Price per night:</b> ', round(atmp_df[i, "Price"], 2), '</p>',
          '<b>Dist from center:</b> ', round(atmp_df[i, "dist_from_city_center"], 2), '</p>') 
})

ma<-leaflet(atmp_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label = lapply(labs, htmltools::HTML)) %>%
  addLegend(pal = pal, values = ~Price, title = "Average price",
    labFormat = labelFormat(suffix = "€"))

mba<-leaflet(batmp_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
  addLegend(pal = pal, values = ~Price, title = "Average price",
    labFormat = labelFormat(suffix = "€"))

p1 = bscols(ma, mba)

mbe<-leaflet(betmp_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
  addLegend(pal = pal, values = ~Price, title = "Average price",
    labFormat = labelFormat(suffix = "€"))

ml<-leaflet(ltmp_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
  addLegend(pal = pal, values = ~Price, title = "Average price",
    labFormat = labelFormat(suffix = "€"))

p2 = bscols(mbe, ml)

mr<-leaflet(rtmp_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
  addLegend(pal = pal, values = ~Price, title = "Average price",
    labFormat = labelFormat(suffix = "€"))

mw<-leaflet(wtmp_df) %>% addProviderTiles('CartoDB.Positron') %>% 
    addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
  addLegend(pal = pal, values = ~Price, title = "Average price",
    labFormat = labelFormat(suffix = "€"))

p3 = bscols(mr, mw)

p1
p2
p3

TODO - Compute distance_from_center column - show better stats/graphs on group of columns (as did above) - compute correlation matrix (qualitative and quantitative) to show what drives prices (and if it differs by city) - Exclude useless columns by correlation above or PCA - try clustering by city and clustering the entire dataset to show if changes - try and predict a price of something - evaluate model - Try and predict the city of a room based on other charateristics - try clusters with and w/out prices and see if prices variance is small in the clusters - Try and predict the review scores -